Pangenome homology groups summary

Author

Lakhansing Pardeshi

Published

January 17, 2025

Initial setup

Code
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(configr))
suppressPackageStartupMessages(library(ggpubr))
suppressPackageStartupMessages(library(here))
suppressPackageStartupMessages(library(matrixStats))
suppressPackageStartupMessages(library(org.Pectobacterium.spp.pan.eg.db))

rm(list = ls())

source("https://raw.githubusercontent.com/lakhanp1/omics_utils/main/RScripts/utils.R")
source("https://raw.githubusercontent.com/lakhanp1/omics_utils/main/GO_enrichment/enrichment_functions.R")
source("scripts/utils/config_functions.R")
source("scripts/utils/phylogeny_functions.R")
source("scripts/utils/homology_groups.R")
################################################################################
set.seed(124)

confs <- prefix_config_paths(
  conf = suppressWarnings(configr::read.config(file = "project_config.yaml")),
  dir = "."
)

pangenome <- confs$data$pangenomes$pectobacterium.v2$name
panConf <- confs$data$pangenomes[[pangenome]]

analysisName <- "homology_groups"
outDir <- confs$analysis$homology_groups$dir
outPrefix <- file.path(outDir, analysisName)
panOrgDb <- org.Pectobacterium.spp.pan.eg.db

Import data

Code
sampleInfo <- get_metadata(file = panConf$files$metadata, genus = confs$genus)

sampleInfoList <- as.list_metadata(
  df = sampleInfo, sampleId, sampleName, SpeciesName, strain, nodeLabs, genomeId
)

spOrder <- suppressMessages(
  readr::read_tsv(confs$analysis$phylogeny$core_snp_ml$files$species_order)
)

hgTable <- AnnotationDbi::select(
  x = panOrgDb, keys = keys(panOrgDb),
  columns = c("GID", "genomeId", "class")
) %>%
  dplyr::rename(hgId = GID) %>%
  dplyr::count(hgId, genomeId, name = "nGenes")
'select()' returned 1:many mapping between keys and columns
Code
hgMeta <- AnnotationDbi::select(
  x = panOrgDb, keys = keys(panOrgDb),
  columns = c("GID", "class")
) %>%
  dplyr::rename(hgId = GID) %>%
  dplyr::mutate(
    class = dplyr::if_else(
      condition = class == "core & single copy orthologous",
      true = "core", false = class
    )
  )
'select()' returned 1:1 mapping between keys and columns
Code
## pangenome version HG stats
grpStats <- suppressMessages(readr::read_tsv(file.path(outDir, "group_stats.tab"))) %>% 
  dplyr::mutate(
    data = forcats::as_factor(data),
    group = forcats::fct_relevel(group, "core", "accessory", "unique")
  )

## binary matrix for homology_group PAV
hgPavMat <- homology_groups_mat(pandb = panOrgDb, type = "pav")
'select()' returned 1:many mapping between keys and columns

Species level homology group statistics

Code
spNames <- dplyr::count(sampleInfo, SpeciesName) %>%
  # dplyr::filter(n >= 20) %>%
  dplyr::pull(SpeciesName)

sppGrpStats <- NULL
sppGrpGo <- NULL

# matrixStats::colSums2(x = hgPavMat, useNames = T) %>%
#   tibble::enframe(name = "hgId", value = "nGenomes") %>%
#   dplyr::mutate(
#     class = dplyr::case_when(
#       nGenomes == !!nrow(hgPavMat) ~ "core",
#       nGenomes == 1 ~ "unique",
#       nGenomes < !!nrow(hgPavMat) & nGenomes > 1 ~ "accessory"
#     )
#   ) %>% 
#   dplyr::count(class)

## get species wise core, accessory, unique group stats and GO
for (sp in spNames) {
  spGenomes <- dplyr::filter(sampleInfo, SpeciesName == .env$sp) %>%
    dplyr::pull(genomeId)
  
  cat(sp, ": ", length(spGenomes), "\n")
  
  hgSum <- matrixStats::colSums2(
    x = hgPavMat, useNames = T,
    rows = which(rownames(hgPavMat) %in% spGenomes)
  ) %>%
    tibble::enframe(name = "hgId", value = "nGenomes") %>%
    dplyr::filter(nGenomes != 0) %>%
    dplyr::mutate(
      subpan_class = dplyr::case_when(
        nGenomes == !!length(spGenomes) ~ "core",
        nGenomes == 1 ~ "unique",
        nGenomes < !!length(spGenomes) & nGenomes > 1 ~ "accessory"
      )
    ) %>% 
    dplyr::left_join(
      y = dplyr::rename(hgMeta, pangenome_class = class),
      by = "hgId"
    )
  
  ## group stats
  sppGrpStats <- dplyr::count(hgSum, pangenome_class, subpan_class, name = "count") %>%
    dplyr::mutate(
      SpeciesName = .env$sp, nHgs = !!nrow(hgSum),
      n_genomes = length(spGenomes)
    ) %>%
    dplyr::bind_rows(sppGrpStats)

}
P. actinidiae :  5 
P. aquaticum :  7 
P. aroidearum :  33 
P. atrosepticum :  16 
P. betavasculorum :  2 
P. brasiliense :  138 
P. cacticida :  1 
P. carotovorum :  33 
P. colocasium :  1 
P. fontis :  1 
P. odoriferum :  21 
P. parmentieri :  37 
P. parvum :  10 
P. peruviense :  4 
P. polaris :  25 
P. polonicum :  3 
P. punjabense :  9 
P. quasiaquaticum :  8 
P. versatile :  93 
P. wasabiae :  3 
P. zantedeschiae :  3 
Pectobacterium sp. CFBP8739 :  1 

Proportion of genus pangenome HGs in the species pangenome

Code
hgStats <- dplyr::count(hgMeta, class, name = "count") %>%
  dplyr::mutate(
    SpeciesName = "Pangenome",
    nHgs = sum(count),
    pangenome_class = class,
    subpan_class = class,
    n_genomes = nrow(hgPavMat)
  ) %>%
  dplyr::bind_rows(sppGrpStats) %>%
  dplyr::select(SpeciesName, n_genomes, pangenome_class, subpan_class, count, nHgs) %>% 
  dplyr::mutate(
    fraction = round(count / nHgs, digits = 3),
    dplyr::across(
      .cols = c(subpan_class, pangenome_class),
      .fns = ~ forcats::fct_relevel(.x, "core", "accessory", "unique")
    ),
    SpeciesName = forcats::fct_relevel(SpeciesName, "Pangenome", !!!spOrder$SpeciesName)
  ) %>% 
  dplyr::arrange(desc(n_genomes), SpeciesName, pangenome_class, subpan_class)

readr::write_tsv(
  x = hgStats, file = confs$analysis$homology_groups$files$spp_group_stats
)

Plot the data

Code
species_to_show <- dplyr::count(sampleInfo, SpeciesName) %>%
  dplyr::filter(n >= 20) %>% 
  dplyr::arrange(desc(n)) %>% 
  dplyr::mutate(
    SpeciesName = forcats::as_factor(SpeciesName)
  )

pt_stats <- dplyr::left_join(
  x = species_to_show, y = hgStats, by = "SpeciesName"
) %>% 
  ggplot() +
  geom_bar(
    mapping = aes(
      x = count, y = SpeciesName,
      fill = forcats::fct_rev(subpan_class)
    ),
    stat = "sum", position = position_dodge(), width = 0.8
  ) +
  scale_fill_manual(
    values = c(
      confs$colors$core, confs$colors$accessory, confs$colors$unique
    ),
    breaks = c("core", "accessory", "unique")
  ) +
  scale_x_continuous(expand = expansion(mult = c(0, 0.01))) +
  theme_bw(base_size = 16) +
  theme(
    panel.grid = element_blank(),
    axis.title = element_blank(),
    axis.text = element_text(face = "bold"),
    axis.text.y = element_text(face = "bold.italic"),
    legend.position = "bottom",
    legend.title = element_blank()
  )


ggsave(
  plot = pt_stats, filename = file.path(outDir, "pangenome_spp_stats.pdf"),
  width = 6, height = 4
)

pt_stats

Homology group counts

Compare pangenome versions

Code
pt_hgStats <- ggplot2::ggplot(grpStats) +
  geom_bar(
    mapping = aes(x = data, y = count, fill = forcats::fct_rev(group)),
    stat = "identity",
    position = position_stack()
  ) +
  scale_y_continuous(expand = expansion(add = c(0, 100))) +
  scale_fill_manual(
    name = NULL,
    values = c("core" = confs$colors$core, "accessory" = confs$colors$accessory,
               "unique" = confs$colors$unique)
    ) +
  theme_bw(base_size = 24) +
  theme(
    panel.grid = element_blank(),
    legend.position = "bottom",
    axis.title = element_blank(),
    axis.text.x = element_text(face = "italic")
  )

ggsave(filename = file.path(outDir, "hg_cmp_stats.pdf"), plot = pt_hgStats, width = 6, height = 6)

pt_hgStats

Pangenome version comparison

Heap’s law alpha for multiple species in pangenome

Code
heaps <- suppressMessages(readr::read_tsv(file.path(outDir, "heaps_law.tab"))) %>%
  dplyr::filter(species != "pangenome") %>%
  dplyr::mutate(
    complete = "complete",
    species = forcats::fct_relevel(species, !!!levels(species_to_show$SpeciesName))
  )

pt_alpha <- ggplot(data = heaps) +
  geom_bar(
    mapping = aes(y = species, x = alpha),
    fill = "black", stat = "identity", width = 0.8
  ) +
  scale_x_continuous(expand = expansion(mult = c(0, 0.01))) +
  theme_bw(base_size = 16) +
  theme(
    panel.grid = element_blank(),
    axis.title = element_blank(),
    axis.text = element_text(face = "bold"),
    axis.text.y = element_text(face = "bold.italic")
  )


ggsave(
  plot = pt_alpha, filename = file.path(outDir, "pangenome_spp_alpha.pdf"),
  width = 5, height = 4
)

pt_alpha

Heaps law alpha

Species level GO enrichment results

Code
panGo <- suppressMessages(
  readr::read_tsv(confs$analysis$homology_groups$files$spp_group_go)
) %>% 
  dplyr::filter(SpeciesName == "pangenome")

ptDf <- dplyr::group_by(panGo, class) %>%
  dplyr::arrange(pvalue, .by_group = TRUE) %>%
  dplyr::slice(1:8) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(
    class = forcats::fct_relevel(class, "core", "accessory", "unique")
  )

pt_go <- enrichment_bar(df = ptDf, title = "pangenome GO", colorCol = "class")

pt_go <- pt_go +
  scale_y_continuous(expand = expansion(mult = c(0, 0.2))) +
  scale_x_discrete(labels = label_wrap(60)) +
  scale_fill_manual(
    values = c(
      "core" = confs$colors$core, "accessory" = confs$colors$accessory,
      "unique" = confs$colors$unique
    )
  )

ggsave(
  plot = pt_go, filename = file.path(outDir, "pangenome_GO.pdf"),
  width = 8, height = 6
)

pt_go

Homology group GO enrichment
Back to top